# Import the required libraries
import pandas as pd
import numpy as np
import country_converter as coco
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')
import nltk
%matplotlib inline
# Load the 2023 dataset
df = pd.read_csv('../datacsv/ds_salaries.csv')
# Drop the two columns by label. Naming them via `columns=` is the documented
# form; passing a DataFrame slice as the labels only works because iterating
# a DataFrame yields its column names.
df.drop(columns=['salary', 'salary_currency'], inplace=True)
print(df.shape)
df.head()
(3755, 9)
| | work_year | experience_level | employment_type | job_title | salary_in_usd | employee_residence | remote_ratio | company_location | company_size |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023 | SE | FT | Principal Data Scientist | 85847 | ES | 100 | ES | L |
| 1 | 2023 | MI | CT | ML Engineer | 30000 | US | 100 | US | S |
| 2 | 2023 | MI | CT | ML Engineer | 25500 | US | 100 | US | S |
| 3 | 2023 | SE | FT | Data Scientist | 175000 | CA | 100 | CA | M |
| 4 | 2023 | SE | FT | Data Scientist | 120000 | CA | 100 | CA | M |
# Count the missing values in every column.
df.isna().sum()
work_year 0 experience_level 0 employment_type 0 job_title 0 salary_in_usd 0 employee_residence 0 remote_ratio 0 company_location 0 company_size 0 dtype: int64
# Expand the short experience-level codes into readable labels in one pass.
df['experience_level'] = df['experience_level'].replace({
    'EN': 'Entry-level/Junior',
    'MI': 'Mid-level/Intermediate',
    'SE': 'Senior-level/Expert',
    'EX': 'Executive-level/Director',
})

# Plot the number of rows per experience level as a treemap.
ex_level = df['experience_level'].value_counts()
fig = px.treemap(
    ex_level,
    path=[ex_level.index],
    values=ex_level.values,
    title='Experience Level',
)
fig.show()
# Report how many distinct job titles the dataset contains.
print('Different job designations altogether :', df['job_title'].nunique(dropna=False))
Different job designations altogether : 93
# Keep the first 15 entries of the job-title counts and chart them as bars,
# labelling each bar with its count.
top15_job_titles = df['job_title'].value_counts().head(15)
fig = px.bar(
    x=top15_job_titles.index,
    y=top15_job_titles.values,
    text=top15_job_titles.values,
    title='Top 15 Job Designations',
)
fig.update_layout(xaxis_title="Job Designations", yaxis_title="Count")
fig.show()
def Freq_df(word_list):
    """Return a frequency table for the items in *word_list*.

    Args:
        word_list: Iterable of hashable items (e.g. job-title strings).

    Returns:
        A DataFrame with a fresh 0..n-1 index and two columns: ``Term``
        (the distinct items) and ``Frequency`` (how often each occurs),
        ordered from most to least frequent. Items with equal counts keep
        the order in which they were first seen. An empty input yields an
        empty DataFrame that still has both columns.
    """
    # Local import keeps the function self-contained; Counter is the base
    # class of nltk.FreqDist, so the counts are the same.
    from collections import Counter

    # most_common() already sorts by descending count, and building the frame
    # from (term, count) pairs with explicit column names also works when
    # there are no pairs at all.
    ranked_terms = Counter(word_list).most_common()
    return pd.DataFrame(ranked_terms, columns=['Term', 'Frequency'])
def Word_Cloud(data, title):
    """Draw and display a word cloud for a term -> frequency mapping.

    Args:
        data: Mapping passed to ``WordCloud.generate_from_frequencies``.
        title: Text shown as the plot title.
    """
    plt.figure(figsize=(20, 15))

    # Fixed random_state so the same frequencies give the same layout.
    cloud = WordCloud(
        width=1200,
        height=600,
        max_words=50,
        background_color='white',
        max_font_size=100,
        random_state=42,
    )
    cloud.generate_from_frequencies(data)

    plt.imshow(cloud)
    plt.title(title)
    plt.axis('off')  # the axes carry no meaning for a word cloud image
    plt.show()
# Count how often each job title occurs, then render the counts as a word cloud.
freq_df = Freq_df(df['job_title'].values.tolist())
# Build the term -> frequency mapping once.
data = freq_df.set_index('Term').to_dict()['Frequency']
Word_Cloud(data , 'WordCloud of job designations')